Importing data
#load L1000 files===============
DIR<-c("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/l1000/")
#FILES<-list.files(path = paste0(DIR, "WNT_inhibitor"), pattern = "*_info.txt", full.names = T, recursive = T)
FILES<-list.files(path = paste0(DIR, "WNT_related_conditions"), pattern = "*_info.txt", full.names = T, recursive = T)
gene_info<-read.table(file = paste0(DIR, "GSE92742_Broad_LINCS_gene_info.txt"), sep = "\t", quote = "", header = T)
cell_info<-read.delim2(file = paste0(DIR, "GSE92742_Broad_LINCS_cell_info.txt"), sep = "\t", quote = "", header = T)
colnames<-read.table(file = paste0(DIR, "colnames.txt"))
#load gse39612 data set ====================
gse39612_renormalized_edat<-readRDS(file = "/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/renormalized_samples_edat_no_celllines.RDS")
#load NE marker gene list ================
neuroendocrine_markers<-c("ENO2", "NEFM", "NEFH", "NMB", "HES6", "SOX2", "ATOH1", "CHGA")
Overlapping the NE1000
genes with perturbed top 1000 genes in LINCS L1000 data under different
perturbagens
n=500
gse39612_high_NE_score_genes<-names(NE_score_sorted[1:n])
gse39612_low_NE_score_genes<-names(rev(NE_score_sorted)[1:n])
annotate_genes<-list()
for (i in 1:length(FILES)){
#RDS_FILES<-list.files(path = paste0(DIR, "WNT_inhibitor"), pattern = "*.RDS", full.names = T, recursive = T)
RDS_FILES<-list.files(path = paste0(DIR, "WNT_related_conditions"), pattern = "*.RDS", full.names = T, recursive = T)
level4_all<-readRDS(RDS_FILES[i])
drug_lable<-strsplit(strsplit(RDS_FILES[i], "/")[[1]][10], "_")[[1]][1]
gene_info$pr_gene_id <- as.character(gene_info$pr_gene_id)
level4_matrix<-mat(level4_all)
level4_zscore<-as.data.frame(level4_matrix)
level4_zscore$pr_gene_id<-rownames(level4_zscore)
level4_zscore<-left_join(level4_zscore, gene_info, by = "pr_gene_id")
rownames(level4_zscore)<-level4_zscore$pr_gene_symbol
level4_zscore_analysis<-level4_zscore[, 1:(ncol(level4_zscore)-5)]
level4_zscore_analysis$mean<-apply(level4_zscore_analysis, 1, mean)
level4_zscore_analysis$gene<-rownames(level4_zscore_analysis)
#Organize l1000 level4 sample info data ============================
samples_info<-read.table(file = FILES[i] ,sep = "\t", header = F)
colnames(samples_info)<-colnames
group_all=samples_info[,c("sample_id" ,"pert_itime", "pert_idose", "cell_iname" , "project_code")]
rownames(group_all)<-samples_info$sample_id
group_all<-group_all[colnames(level4_zscore),]
group_all<-na.omit(group_all)
cell_info_df<-cell_info[,c("base_cell_id", "primary_site", "subtype", "original_growth_pattern")]
colnames(cell_info_df)<-c("cell_iname","primary_site", "subtype", "original_growth_pattern")
group_all_new<-left_join(group_all, cell_info_df, by = "cell_iname", keep = F)
group_all<-group_all_new[!duplicated(group_all_new$sample_id),]
group_all$dosage<-unlist(lapply(group_all$pert_idose, function(x) strsplit(x, " ")[[1]][1]))
group_all<-group_all[group_all$dosage <= 10,]
L1000_zscored_fc<-level4_zscore_analysis
L1000_sample_info<-group_all
all_fc_edat<-L1000_zscored_fc #total 12328 genes
df<-all_fc_edat
L1000_top_downregulated_genes_all<-all_fc_edat[order(all_fc_edat$mean, decreasing = F),][1:n,]
L1000_top_upregulated_genes_all<-all_fc_edat[order(all_fc_edat$mean, decreasing = T),][1:n,]
annotate_down<-intersect(gse39612_high_NE_score_genes, rownames(L1000_top_downregulated_genes_all))
annotate_up<-intersect(gse39612_low_NE_score_genes, rownames(L1000_top_upregulated_genes_all))
annotate_gene_set<-c(annotate_down, annotate_down)
annotate_genes[[drug_lable]]<-annotate_gene_set
print(annotate_gene_set)
annotate_down_df<-all_fc_edat[annotate_down,]
annotate_up_df<-all_fc_edat[annotate_up,]
#Calculate jaccard similarity coefficient =========
NE_set = data.frame(genes = c(gse39612_high_NE_score_genes, gse39612_low_NE_score_genes), binary = rep(0, length(c(gse39612_high_NE_score_genes, gse39612_low_NE_score_genes))))
NE_set[which(NE_set$genes %in% annotate_gene_set == TRUE), "binary"]<-1
L1000_set = data.frame(genes = c(rownames(L1000_top_downregulated_genes_all), rownames(L1000_top_upregulated_genes_all)),
binary = rep(0,length(c(rownames(L1000_top_downregulated_genes_all), rownames(L1000_top_upregulated_genes_all)))))
L1000_set[which(L1000_set$genes %in% annotate_gene_set == TRUE), "binary"]<-1
jaccard_test<-jaccard.test(NE_set$binary, L1000_set$binary, method = "bootstrap")
print(drug_lable)
print("Performing jaccard test with 'bootstrap' method")
print(jaccard_test$pvalue)
#plot for each drug ==========
df<-na.omit(df)
p1 = ggplot(data = df,
aes(x=reorder(gene, mean), y=mean),
label = gene)+
geom_point(color = "grey75",
size = 5
)+
geom_point(data = annotate_down_df , # New layer containing data subset il_genes
size = 3,
shape = 21,
fill = "firebrick",
colour = "black")+
geom_text_repel(data = annotate_down_df, aes(label = gene), size = 6, fontface = "bold", colour = "black", force = 3, max.overlaps = 70) +
geom_point(data = annotate_up_df , # New layer containing data subset il_genes
size = 3,
shape = 21,
fill = "blue",
colour = "black")+
geom_text_repel(data = annotate_up_df, aes(label = gene), size = 6, fontface = "bold", colour = "black", force = 3, max.overlaps = 70) +
ylab(paste0("Mean of Z-scored fold change of all cell lines"))+
xlab("Genes")+
scale_x_discrete(expand = c(0.15,0))+
lims(y = c(-7.5, 5))+
ggtitle(paste0("L1000 top dysregulated genes overlapping with GSE39612 MCC NE_1000 genes \n -", drug_lable))+
theme(axis.text.x = element_blank(),
plot.title = element_text(size = 30, face = "bold"),
panel.background = element_blank(),
axis.line.y = element_line(),
#plot.margin = unit(c(1,1,1,1), "cm"),
axis.title.x = element_text(size=30, face="bold"),
axis.title.y = element_text(size=30, face="bold"),
axis.text.y = element_text(size=30, face="bold"))
#pdf(file = paste0("/xdisk/mpadi/jiawenyang/result/merkel_cell/pp_in_mcc_paper/figure/L1000_top_ranked_overlapped_gene_in_", drug_lable,".pdf" ), height = 20, width = 24)
plot(p1)
#dev.off()
}
## [1] "UBE2S" "SLC20A1" "MCM2" "NETO2" "CDC45" "TIMELESS"
## [7] "AP3D1" "ESPL1" "MCM10" "EEF1A2" "CCNB1" "MPPED2"
## [13] "CCNB2" "TOP2A" "SNRPE" "RAD51C" "DHFR" "PCNA"
## [19] "SNRNP25" "CDC20" "TXNRD1" "B4GALT6" "CDC25B" "TTK"
## [25] "UBE2S" "SLC20A1" "MCM2" "NETO2" "CDC45" "TIMELESS"
## [31] "AP3D1" "ESPL1" "MCM10" "EEF1A2" "CCNB1" "MPPED2"
## [37] "CCNB2" "TOP2A" "SNRPE" "RAD51C" "DHFR" "PCNA"
## [43] "SNRNP25" "CDC20" "TXNRD1" "B4GALT6" "CDC25B" "TTK"
## [1] "CSNK1A1-OE"
## [1] "Performing jaccard test with 'bootstrap' method"
## [1] 0.432

## [1] "FEN1" "MCM6" "PARP1" "UBE2S" "PAFAH1B3" "MCM2"
## [7] "STMN1" "FANCI" "NETO2" "TYMS" "MTHFD2" "GINS2"
## [13] "DTL" "NCAPG" "HMGN2" "RRM2" "TRIP13" "NCAPG2"
## [19] "CDK5R1" "DEPDC1" "NEK2" "CDC25A" "FOXM1" "CDC6"
## [25] "BUB1" "LBR" "AURKA" "ZWINT" "DLGAP5" "GINS1"
## [31] "H2AFX" "MKI67" "KIF4A" "UBE2C" "KIF20A" "TRMT5"
## [37] "FBXO5" "DDX39A" "TBP" "MCM4" "CENPM" "SPAG5"
## [43] "CCNB1" "RFC4" "MCM7" "BIRC5" "EEF1E1" "CCNB2"
## [49] "MELK" "KIF18B" "TOP2A" "ORC6" "ASF1B" "CBX3"
## [55] "PRC1" "CDCA8" "SMC4" "XPO6" "MYBL1" "OIP5"
## [61] "CCNA2" "PTTG1" "SNRPE" "TMPO" "RAD54B" "NCAPH"
## [67] "PCNA" "URB2" "EZH2" "ASPM" "MCM5" "CDC20"
## [73] "SPC25" "TPX2" "CDC25B" "TTK" "FEN1" "MCM6"
## [79] "PARP1" "UBE2S" "PAFAH1B3" "MCM2" "STMN1" "FANCI"
## [85] "NETO2" "TYMS" "MTHFD2" "GINS2" "DTL" "NCAPG"
## [91] "HMGN2" "RRM2" "TRIP13" "NCAPG2" "CDK5R1" "DEPDC1"
## [97] "NEK2" "CDC25A" "FOXM1" "CDC6" "BUB1" "LBR"
## [103] "AURKA" "ZWINT" "DLGAP5" "GINS1" "H2AFX" "MKI67"
## [109] "KIF4A" "UBE2C" "KIF20A" "TRMT5" "FBXO5" "DDX39A"
## [115] "TBP" "MCM4" "CENPM" "SPAG5" "CCNB1" "RFC4"
## [121] "MCM7" "BIRC5" "EEF1E1" "CCNB2" "MELK" "KIF18B"
## [127] "TOP2A" "ORC6" "ASF1B" "CBX3" "PRC1" "CDCA8"
## [133] "SMC4" "XPO6" "MYBL1" "OIP5" "CCNA2" "PTTG1"
## [139] "SNRPE" "TMPO" "RAD54B" "NCAPH" "PCNA" "URB2"
## [145] "EZH2" "ASPM" "MCM5" "CDC20" "SPC25" "TPX2"
## [151] "CDC25B" "TTK"
## [1] "CTNNB1-KD"
## [1] "Performing jaccard test with 'bootstrap' method"
## [1] 0.048

## [1] "MPHOSPH9" "PAFAH1B3" "TYMS" "MTHFD2" "RRM2" "ZWINT"
## [7] "NRCAM" "EEF1A2" "ATF5" "PHC2" "SNRPE" "UCP2"
## [13] "PCNA" "EZH2" "CDC20" "MPHOSPH9" "PAFAH1B3" "TYMS"
## [19] "MTHFD2" "RRM2" "ZWINT" "NRCAM" "EEF1A2" "ATF5"
## [25] "PHC2" "SNRPE" "UCP2" "PCNA" "EZH2" "CDC20"
## [1] "TCF3-KD"
## [1] "Performing jaccard test with 'bootstrap' method"
## [1] 0.594

## [1] "INA" "BEX1" "RUNDC3A" "CNTN2" "GDAP1" "BUB1B" "MTHFD2"
## [8] "MYO15A" "MYT1" "NUP210" "ACTL6B" "PLEKHA6" "BRINP1" "EEF1A2"
## [15] "ST8SIA3" "UNC13A" "BCL2L11" "RABL6" "AP3B2" "DONSON" "ACVR2B"
## [22] "ARNT2" "PCNA" "NCAM1" "CSRNP3" "FCHSD2" "CDC20" "INA"
## [29] "BEX1" "RUNDC3A" "CNTN2" "GDAP1" "BUB1B" "MTHFD2" "MYO15A"
## [36] "MYT1" "NUP210" "ACTL6B" "PLEKHA6" "BRINP1" "EEF1A2" "ST8SIA3"
## [43] "UNC13A" "BCL2L11" "RABL6" "AP3B2" "DONSON" "ACVR2B" "ARNT2"
## [50] "PCNA" "NCAM1" "CSRNP3" "FCHSD2" "CDC20"
## [1] "TCF4-KD"
## [1] "Performing jaccard test with 'bootstrap' method"
## [1] 0.017

## [1] "MCM6" "STMN1" "TYMS" "NCAPG" "NCAPG2" "DEPDC1" "BUB1" "LBR"
## [9] "AURKA" "DLGAP5" "MKI67" "KIF20A" "KIF14" "TBP" "SPAG5" "NDC80"
## [17] "CCNB2" "MYBL2" "CDCA8" "SMC4" "DONSON" "CCNA2" "NCAPH" "TACC3"
## [25] "PCNA" "MAP1S" "KIF18A" "KIF2C" "BAHCC1" "CDC20" "SPC25" "TPX2"
## [33] "CDC25B" "CEP55" "TTK" "SLIRP" "MCM6" "STMN1" "TYMS" "NCAPG"
## [41] "NCAPG2" "DEPDC1" "BUB1" "LBR" "AURKA" "DLGAP5" "MKI67" "KIF20A"
## [49] "KIF14" "TBP" "SPAG5" "NDC80" "CCNB2" "MYBL2" "CDCA8" "SMC4"
## [57] "DONSON" "CCNA2" "NCAPH" "TACC3" "PCNA" "MAP1S" "KIF18A" "KIF2C"
## [65] "BAHCC1" "CDC20" "SPC25" "TPX2" "CDC25B" "CEP55" "TTK" "SLIRP"
## [1] "WNT5A-OE"
## [1] "Performing jaccard test with 'bootstrap' method"
## [1] 0.539

## [1] "FEN1" "MCM6" "PARP1" "UBE2S" "PAFAH1B3" "MCM2"
## [7] "STMN1" "FANCI" "NETO2" "RNASEH2A" "TYMS" "GINS2"
## [13] "CDC45" "TIMELESS" "NCAPG" "RRM2" "ESPL1" "TRIP13"
## [19] "NCAPG2" "DEPDC1" "NEK2" "CDC25A" "CTPS1" "LPCAT1"
## [25] "FOXM1" "BUB1" "LBR" "AURKA" "ZWINT" "DLGAP5"
## [31] "CDC7" "HJURP" "GINS1" "LIG1" "H2AFX" "MCM10"
## [37] "MKI67" "KIF4A" "UBE2C" "KIF20A" "DTYMK" "GTSE1"
## [43] "TRMT5" "FBXO5" "KIF14" "CENPE" "CKS2" "CENPM"
## [49] "SPAG5" "CCNB1" "BIRC5" "NDC80" "CCNB2" "MELK"
## [55] "KIF18B" "TOP2A" "MYBL2" "ASF1B" "PRC1" "CDCA8"
## [61] "SMC4" "XPO6" "OIP5" "CCNA2" "NEIL3" "PTTG1"
## [67] "POLE2" "HMMR" "NCAPH" "TACC3" "PCNA" "PSRC1"
## [73] "KIF18A" "KIF2C" "SNRNP25" "EZH2" "ASPM" "MCM5"
## [79] "CDC20" "SPC25" "TPX2" "CDC25B" "CEP55" "TTK"
## [85] "FEN1" "MCM6" "PARP1" "UBE2S" "PAFAH1B3" "MCM2"
## [91] "STMN1" "FANCI" "NETO2" "RNASEH2A" "TYMS" "GINS2"
## [97] "CDC45" "TIMELESS" "NCAPG" "RRM2" "ESPL1" "TRIP13"
## [103] "NCAPG2" "DEPDC1" "NEK2" "CDC25A" "CTPS1" "LPCAT1"
## [109] "FOXM1" "BUB1" "LBR" "AURKA" "ZWINT" "DLGAP5"
## [115] "CDC7" "HJURP" "GINS1" "LIG1" "H2AFX" "MCM10"
## [121] "MKI67" "KIF4A" "UBE2C" "KIF20A" "DTYMK" "GTSE1"
## [127] "TRMT5" "FBXO5" "KIF14" "CENPE" "CKS2" "CENPM"
## [133] "SPAG5" "CCNB1" "BIRC5" "NDC80" "CCNB2" "MELK"
## [139] "KIF18B" "TOP2A" "MYBL2" "ASF1B" "PRC1" "CDCA8"
## [145] "SMC4" "XPO6" "OIP5" "CCNA2" "NEIL3" "PTTG1"
## [151] "POLE2" "HMMR" "NCAPH" "TACC3" "PCNA" "PSRC1"
## [157] "KIF18A" "KIF2C" "SNRNP25" "EZH2" "ASPM" "MCM5"
## [163] "CDC20" "SPC25" "TPX2" "CDC25B" "CEP55" "TTK"
## [1] "pyrvinium"
## [1] "Performing jaccard test with 'bootstrap' method"
## [1] 0.001

Performing pairwise
intersection on overlapped genes in different Wnt-perturbing
conditions
Organizing data
fset<-unique(unlist(annotate_genes))
#pairwise intersection
conditions<-unlist(lapply(RDS_FILES, function(x) strsplit(strsplit(x, "/")[[1]][10], "_")[[1]][1]))
combs<-combn(length(conditions), 2)
summary<-data.frame()
for (i in 1:ncol(combs)){
pair_id<-combs[,i]
set1<-annotate_genes[[pair_id[1]]]
set2<-annotate_genes[[pair_id[2]]]
set1id<-conditions[pair_id[1]]
set2id<-conditions[pair_id[2]]
int<-intersect(set1, set2)
df<-data.frame(set1 = set1id, set2 = set2id, intersection = paste(int, collapse = "/"), gene_count = length(int))
#df<-data.frame(set1 = set1id, set2 = set2id, intersection = int, gene_count = length(int))
summary<-rbind(summary, df)
}
summary$set1<-as.factor(summary$set1)
summary$set2<-as.factor(summary$set2)
Generating circos
plot from the pariwise intersection
### You need several libraries
library(circlize)
library(migest)
library(dplyr)
#pdf(file = "/xdisk/mpadi/jiawenyang/result/merkel_cell/pp_in_mcc_paper/figure/L1000_top_ranked_overlaped_genes_in_Wnt_related_conditions_circos_plot.pdf", height =24 , width =22)
#pdf(file = "/xdisk/mpadi/jiawenyang/result/merkel_cell/pp_in_mcc_paper/figure/L1000_top_ranked_overlaped_genes_in_Wnt_inhibitors_circos_plot.pdf", height =20 , width =16)
### Make data
colors<-brewer.pal(n = length(conditions), "Paired")
df1_new<-data.frame(order = 1:length(conditions),
conditions = conditions,
rcol = colors,
lcol = paste0(colors, "65"),
xmin = rep(0, length(conditions)),
xmax = unlist(lapply(annotate_genes, function(x) length(x))))
### Plot sectors (outer part)
par(mar=rep(0,4))
circos.clear()
### Basic circos graphic parameters
circos.par(cell.padding=c(0,0,0,0), track.margin=c(0,0.15), start.degree = 90, gap.degree = 4)
### Sector details
circos.initialize(factors = df1_new$conditions, xlim = cbind(df1_new$xmin, df1_new$xmax))
### Plot sectors
circos.trackPlotRegion(ylim = c(0, 1), factors = df1_new$conditions, track.height=0.1,
#panel.fun for each sector
panel.fun = function(x, y) {
#select details of current sector
name = get.cell.meta.data("sector.index")
i = get.cell.meta.data("sector.numeric.index")
xlim = get.cell.meta.data("xlim")
ylim = get.cell.meta.data("ylim")
#text direction (dd) and adjusmtents (aa)
theta = circlize(mean(xlim), 1.3)[1, 1] %% 360
dd <- ifelse(theta < 90 || theta > 270, "clockwise", "reverse.clockwise")
aa = c(1, 0.5)
if(theta < 90 || theta > 270) aa = c(0, 0.5)
#plot conditions labels
circos.text(x=mean(xlim), y=1.7, labels=name, facing = dd, cex=1.7, font=2, adj = aa)
#plot main sector
circos.rect(xleft=xlim[1], ybottom=ylim[1], xright=xlim[2], ytop=ylim[2],
col = df1_new$rcol[i], border=df1_new$rcol[i])
#white line all the way around
#circos.rect(xleft=xlim[1], ybottom=0.3, xright=xlim[2], ytop=0.32, col = "white", border = "white")
#plot axis
circos.axis(labels.cex=1.5, direction = "outside", major.at=seq(from=0,to=floor(df1_new$xmax)[i], by = floor(df1_new$xmax)[i])
,minor.ticks=0
)
})
### Plot links (inner part)
### Add sum values to df1, marking the x-position of the first links
### out (sum1) and in (sum2). Updated for further links in loop below.
df1_new$sum1 <- df1_new$xmax
df1_new$sum2 <- numeric(nrow(df1_new))
df1_new$genes <- "/NA"
#df1_new$conditions <- factor(df1_new$conditions, levels = df1_new$conditions)
### Create a data.frame of the flow matrix sorted by flow size, to allow largest flow plotted first
df2_new<-summary[, c("set1", "set2", "gene_count", "intersection")]
df2_new<-arrange(df2_new, desc(gene_count))
### Plot links
for(k in 1:nrow(df2_new)){
#i,j reference of flow matrix
i<-match(df2_new$set1[k],df1_new$conditions)
j<-match(df2_new$set2[k],df1_new$conditions)
genes<-strsplit(subset(df2_new, set1 == df1_new$conditions[i] & set2 == df1_new$conditions[j])[,"intersection"], "/")[[1]]
#plot link
circos.link(sector.index1=df1_new$conditions[i],
point1=c(df1_new$sum2[i]-length(intersect(genes, strsplit(df1_new$genes[i], "/")[[1]])), df1_new$sum2[i] + df2_new$gene_count[k]),
sector.index2=df1_new$conditions[j],
point2=c(df1_new$sum2[j]-length(intersect(genes, strsplit(df1_new$genes[j], "/")[[1]])), df1_new$sum2[j] + df2_new$gene_count[k]),
col = df1_new$lcol[i], rou1 = 0.72, rou2 = 0.72)
#df1_new$genes[i]<-paste0(df1_new$genes[i], "/", paste(genes, collapse = "/"))
#df1_new$genes[j]<-paste0(df1_new$genes[j], "/", paste(genes, collapse = "/"))
df1_new$sum2[i]<-df1_new$sum2[i] + df2_new$gene_count[k] - length(intersect(genes, strsplit(df1_new$genes[i], "/")[[1]]))
df1_new$sum2[j]<-df1_new$sum2[j] + df2_new$gene_count[k] - length(intersect(genes, strsplit(df1_new$genes[j], "/")[[1]]))
df1_new$genes[i]<-paste(unique(c(strsplit(df1_new$genes[i], "/")[[1]], genes)), collapse = "/")
df1_new$genes[j]<-paste(unique(c(strsplit(df1_new$genes[j], "/")[[1]], genes)), collapse = "/")
}

#dev.off()